{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "# NOTE: most of the commented-out code was executed in a terminal and/or \n",
    "# datasets.py and subsequently pasted in here; consequently, it probably\n",
    "# won't work out of the box.\n",
    "\n",
    "import os\n",
    "import numpy as np\n",
    "from scipy.io import loadmat\n",
    "\n",
    "%load_ext autoreload\n",
    "\n",
    "%aimport utils\n",
    "%autoreload 1  # 1 = autoreload only modules imported with %aimport\n",
    "# %autoreload 2\n",
    "\n",
    "DATA_DIR = os.path.expanduser(\"~/Desktop/datasets/nn-search/\")\n",
    "\n",
    "contig = np.ascontiguousarray\n",
    "join = os.path.join\n",
    "\n",
    "def save_to_data_dir(relative_path, X):\n",
    "    np.save(join(DATA_DIR, relative_path), np.ascontiguousarray(X))\n",
    "\n",
    "def load_from_data_dir(relative_path):\n",
    "    return np.load(join(DATA_DIR, relative_path))\n",
    "\n",
    "def loadmat_from_data_dir(relative_path):\n",
    "    return loadmat(join(DATA_DIR, relative_path))\n",
    "\n",
    "def read_yael_vecs(path, c_contiguous=True, limit_rows=-1, dtype=None):\n",
    "    dim = np.fromfile(path, dtype=np.int32, count=2)[0]\n",
    "    print \"vector length = {}\".format(dim)\n",
    "\n",
    "    if dtype is None:\n",
    "        if 'fvecs' in path:\n",
    "            dtype = np.float32\n",
    "        elif 'ivecs' in path:\n",
    "            dtype = np.int32\n",
    "        elif 'bvecs' in path:\n",
    "            dtype = np.uint8\n",
    "        else:\n",
    "            raise ValueError(\"couldn't infer dtype from path {}\".format(path))\n",
    "    itemsize = np.dtype(dtype).itemsize\n",
    "\n",
    "    assert dim > 0\n",
    "    assert itemsize in (1, 2, 4)\n",
    "\n",
    "    cols_for_dim = 4 // itemsize\n",
    "    row_size_bytes = 4 + dim * itemsize\n",
    "    row_size_elems = row_size_bytes // itemsize\n",
    "    limit = int(limit_rows) * row_size_elems if limit_rows > 0 else -1\n",
    "\n",
    "    fv = np.fromfile(path, dtype=dtype, count=limit)\n",
    "    fv = fv.reshape((-1, row_size_elems))\n",
    "\n",
    "    if not all(fv.view(np.int32)[:, 0] == dim):\n",
    "        raise IOError(\"Non-uniform vector sizes in \" + path)\n",
    "\n",
    "    fv = fv[:, cols_for_dim:]\n",
    "\n",
    "    if c_contiguous:\n",
    "        fv = fv.copy()\n",
    "    return fv\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gist1M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # ------------------------ clean up gist (now broken)\n",
    "# path = Paths.DATA_DIR + '/gist/gist_base.fvecs'\n",
    "# path = Paths.DATA_DIR + '/gist/gist_learn.fvecs'\n",
    "# path = Paths.DATA_DIR + '/gist/gist_queries.fvecs'\n",
    "# path = Paths.DATA_DIR + '/gist/gist_groundtruth.ivecs'\n",
    "# out_path = Paths.GIST_100\n",
    "# out_path = Paths.GIST_200\n",
    "# out_path = Paths.GIST_TRAIN\n",
    "# out_path = Paths.GIST_QUERIES\n",
    "# out_path = Paths.GIST_TRUTH\n",
    "# X = read_yael_vecs(path)[:100000]\n",
    "# X = read_yael_vecs(path)[:200000]\n",
    "# X = read_yael_vecs(path)\n",
    "# X = read_yael_vecs(path, dtype=np.int32)\n",
    "# print X[:2]\n",
    "# print X.shape\n",
    "# np.save(out_path, X)\n",
    "# truth_dir = data_dir + 'gnd/'\n",
    "# # truth_idxs_files = ['idx_1M', 'idx_10M', 'idx_100M']\n",
    "# truth_idxs_files = ['idx_1000M']\n",
    "# for f in truth_idxs_files:\n",
    "#     path = truth_dir + f + '.ivecs'\n",
    "#     out_path = out_dir + f + '.npy'\n",
    "#     print \"unpacking {} to {}\".format(path, out_path)\n",
    "#     X = read_yael_vecs(path)\n",
    "#     print X.shape\n",
    "#     np.save(out_path, X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sift1M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ------------------------ clean up sift1m\n",
    "# data_dir = '/Volumes/MacHDD/datasets/sift1m/'\n",
    "# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/sift1m/'\n",
    "# for fname in os.listdir(data_dir):\n",
    "#     in_path = data_dir + fname\n",
    "#     out_path = out_dir + fname.split('.')[0] + '.npy'\n",
    "#     print \"unpacking {} to {}\".format(in_path, out_path)\n",
    "#     X = read_yael_vecs(in_path)\n",
    "#     print X.shape\n",
    "#     np.save(out_path, X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deep1M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300000, 256)\n",
      "False\n",
      "(1000, 256)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# # ------------------------ clean up Deep1M\n",
    "# data_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M-raw/')\n",
    "# out_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M/')\n",
    "# print \"in dir, out dir:\", data_dir, out_dir\n",
    "# for fname in os.listdir(data_dir):\n",
    "#     in_path = data_dir + fname\n",
    "#     out_path = out_dir + fname.split('.')[0] + '.npy'\n",
    "#     print \"unpacking {} to {}\".format(in_path, out_path)\n",
    "#     X = read_yael_vecs(in_path)\n",
    "#     print X.shape\n",
    "#     np.save(out_path, X)\n",
    "\n",
    "X_train = load_from_data_dir('deep1m/deep1M_learn.npy')\n",
    "print X_train.shape\n",
    "print np.isfortran(X_train) # false\n",
    "\n",
    "Q = load_from_data_dir('deep1m/deep1M_queries.npy')\n",
    "print Q.shape\n",
    "print np.isfortran(Q) # false"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing top k for query block 0 (queries 0-128)...\n",
      "computing top k for query block 5 (queries 640-768)...\n",
      "done\n",
      "(1000, 1000)\n",
      "int32\n"
     ]
    }
   ],
   "source": [
    "truth_train = utils.compute_true_knn(X_train, Q)\n",
    "print truth_train.shape\n",
    "print truth_train.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_to_data_dir('deep1m/deep1M_truth_train.npy', truth_train.astype(np.int32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1000000, 256)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "X_test = load_from_data_dir('deep1m/deep1M_base.npy')\n",
    "print X_test.shape\n",
    "print np.isfortran(X_test) # false"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing top k for query block 0 (queries 0-128)...\n",
      "computing top k for query block 5 (queries 640-768)...\n",
      "done\n",
      "(1000, 1000)\n",
      "int32\n"
     ]
    }
   ],
   "source": [
    "truth_test = utils.compute_true_knn(X_test, Q)\n",
    "print truth_test.shape\n",
    "print truth_test.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_to_data_dir('deep1m/deep1M_truth_test.npy', truth_test.astype(np.int32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_to_data_dir('deep1m/deep1m_test_100k.npy', X_test[:100000, :])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convnet 1M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n",
      "(100000, 128)\n"
     ]
    }
   ],
   "source": [
    "# # ------------------------ clean up Convnet1M\n",
    "# import numpy as np\n",
    "# from scipy.io import loadmat\n",
    "# d = loadmat('features_m_128.mat')\n",
    "# contig = np.ascontiguousarray\n",
    "# savedir = '../convnet1m/'\n",
    "# np.save(savedir + 'convnet_train.npy', contig(d['feats_m_128_train']))\n",
    "# np.save(savedir + 'convnet_test.npy', contig(d['feats_m_128_test']))\n",
    "# np.save(savedir + 'convnet_base.npy', contig(d['feats_m_128_base']))\n",
    "\n",
    "X_train = load_from_data_dir('convnet1m/convnet_train.npy')\n",
    "print np.isfortran(X_train) # false\n",
    "print X_train.shape\n",
    "# if np.isfortran(X_train):\n",
    "#     save_to_data_dir('convnet1m/convnet_train.npy', X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 128)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "Q = load_from_data_dir('convnet1m/convnet_queries.npy')\n",
    "print Q.shape\n",
    "print np.isfortran(Q)\n",
    "# save_to_data_dir('convnet1m/convnet_queries.npy', Q)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print Q[:20, :20]  yep, looks like relu activations\n",
    "# dists = utils.sq_dists_to_vectors(Q, X_train)\n",
    "# print dists.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000\n",
      "computing top k for query 0...\n",
      "computing top k for query 500...\n",
      "computing top k for query 1000...\n",
      "computing top k for query 1500...\n",
      "computing top k for query 2000...\n",
      "computing top k for query 2500...\n",
      "computing top k for query 3000...\n",
      "computing top k for query 3500...\n",
      "computing top k for query 4000...\n",
      "computing top k for query 4500...\n",
      "computing top k for query 5000...\n",
      "computing top k for query 5500...\n",
      "computing top k for query 6000...\n",
      "computing top k for query 6500...\n",
      "computing top k for query 7000...\n",
      "computing top k for query 7500...\n",
      "computing top k for query 8000...\n",
      "computing top k for query 8500...\n",
      "computing top k for query 9000...\n",
      "computing top k for query 9500...\n"
     ]
    }
   ],
   "source": [
    "nqueries = Q.shape[0]\n",
    "k = 1000\n",
    "truth_train = np.full((nqueries, k), -999, dtype=np.int32)\n",
    "print nqueries\n",
    "# for i in range(100):\n",
    "for i in range(nqueries):\n",
    "    if i % 1000 == 0:\n",
    "        print \"computing top k for query {}...\".format(i)\n",
    "    truth_train[i, :] = utils.top_k_idxs(dists[i, :], k)\n",
    "print \"done\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert not np.any(truth_train == -999)\n",
    "save_to_data_dir('convnet1m/truth_train.npy', truth_train.astype(np.int32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1000000, 128)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "X_test = load_from_data_dir('convnet1m/convnet_test.npy')\n",
    "print X_test.shape\n",
    "print np.isfortran(X_test)\n",
    "# if np.isfortran(X_test):\n",
    "#     save_to_data_dir('convnet1m/convnet_test.npy', X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 0 (queries 0-128)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 5 (queries 640-768)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 10 (queries 1280-1408)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 15 (queries 1920-2048)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 20 (queries 2560-2688)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 25 (queries 3200-3328)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 30 (queries 3840-3968)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 35 (queries 4480-4608)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 40 (queries 5120-5248)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 45 (queries 5760-5888)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 50 (queries 6400-6528)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 55 (queries 7040-7168)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 60 (queries 7680-7808)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 65 (queries 8320-8448)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 70 (queries 8960-9088)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "computing top k for query block 75 (queries 9600-9728)...\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "WARNING: sq_dists_to_vectors: attempting to create a matrixof size 128000000\n",
      "done\n",
      "(10000, 1000)\n",
      "int32\n"
     ]
    }
   ],
   "source": [
    "# # truth_test = utils.compute_true_knn(X_test[:1000], Q[:130], block_sz=128)\n",
    "# truth_test = utils.compute_true_knn(X_test, Q)\n",
    "# print truth_test.shape\n",
    "# print truth_test.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert not np.any(truth_test == -999)\n",
    "save_to_data_dir('convnet1m/truth_test.npy', truth_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_to_data_dir('convnet1m/convnet_test_100k.npy', X_test[:100000, :])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deep1B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ------------------------ clean up deep1b\n",
    "# data_dir = '/Volumes/MacHDD/datasets/deep1b/'\n",
    "# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/deep1b/'\n",
    "\n",
    "# # expected_cols = 96\n",
    "# # equivalent_elements_in_first_1M = int(1e6) * (1 + expected_cols)\n",
    "# arrays = []\n",
    "# # arrays.append(('deep1B_queries.fvecs', 'deep_queries.npy', -1))\n",
    "# # arrays.append(('deep1B_groundtruth.ivecs', 'deep_true_nn_idxs.npy', -1))\n",
    "# # arrays.append(('deep10M.fvecs', 'deep_1M.npy', 1e6))\n",
    "# arrays.append(('deep10M.fvecs', 'deep_10M.npy', -1))\n",
    "# for in_file, out_file, limit in arrays:\n",
    "#     in_path = data_dir + in_file\n",
    "#     out_path = out_dir + out_file\n",
    "#     X = read_yael_vecs(in_path, limit_rows=limit)\n",
    "#     print \"unpacking {} to {}\".format(in_path, out_path)\n",
    "#     print X.shape\n",
    "#     np.save(out_path, X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LabelMe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ------------------------ clean up LabelMe\n",
    "# >>> from scipy.io import loadmat\n",
    "# >>> d = loadmat('LabelMe_gist.mat')\n",
    "# >>> for k, v in d.iteritems():\n",
    "# ...     try:\n",
    "# ...             print k, v.shape\n",
    "# ...     except:\n",
    "# ...             pass\n",
    "# ...\n",
    "# gist (22019, 512)\n",
    "# img (32, 32, 3, 22019)\n",
    "# nmat (1000, 1000, 20)\n",
    "# __header__ param (1, 1)\n",
    "# __globals__ seg (32, 32, 22019)\n",
    "# names (1, 3597)\n",
    "# DistLM (22019, 22019)\n",
    "# __version__ ndxtrain (1, 20019)\n",
    "# ndxtest (1, 2000)\n",
    "#\n",
    "# okay, no idea what most of these are even with the readme...\n",
    "#\n",
    "# >>> np.save('labelme_train_idxs', d['ndxtrain']) # training data idxs\n",
    "# >>> np.save('labelme_test_idxs', d['ndxtest'])   # test data idxs\n",
    "# >>> np.save('labelme_all_gists', d['gist'])     # actual gist descriptors\n",
    "\n",
    "X = load_from_data_dir('labelme/labelme_all_gists.npy')\n",
    "np.isfortran(X)  # this is true; suggests lots of other stuff also F order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(20019,)\n",
      "(2000,)\n"
     ]
    }
   ],
   "source": [
    "train_idxs = load_from_data_dir('labelme/labelme_train_idxs.npy').ravel() - 1 # one-indexed\n",
    "X_train = X[train_idxs, :]\n",
    "# save_to_data_dir('labelme/labelme_train.npy', X_train)\n",
    "\n",
    "test_idxs = load_from_data_dir('labelme/labelme_test_idxs.npy').ravel() - 1 # one-indexed\n",
    "X_test = X[test_idxs, :]\n",
    "# save_to_data_dir('labelme/labelme_test.npy', X_test)\n",
    "\n",
    "print train_idxs.shape\n",
    "print test_idxs.shape\n",
    "\n",
    "train_set = set(list(train_idxs))\n",
    "test_set = set(list(test_idxs))\n",
    "assert len(train_set.intersection(test_set)) == 0\n",
    "assert len(train_set) + len(test_set) == len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = loadmat_from_data_dir('labelme/LabelMe_gist.mat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['gist', 'img', 'nmat', '__header__', 'param', '__globals__', 'seg', 'names', 'DistLM', '__version__', 'ndxtrain', 'ndxtest']\n",
      "(22019, 512)\n",
      "(32, 32, 22019)\n",
      "(1000, 1000, 20)\n",
      "(22019, 22019)\n",
      "836\n"
     ]
    }
   ],
   "source": [
    "print d.keys()\n",
    "print d['gist'].shape\n",
    "print d['seg'].shape\n",
    "print d['nmat'].shape\n",
    "print d['DistLM'].shape\n",
    "print (d['DistLM'] != 0).sum() / 22019\n",
    "\n",
    "# alright, ya, there don't seem to be predefined queries or true neighbors for them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dists = utils.sq_dists_to_vectors(X, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# truth = np.argsort(dists, axis=1)[:1000]\n",
    "# save_to_data_dir('labelme/labelme_truth.npy', truth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(20019, 512)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "X_train = load_from_data_dir('labelme/labelme_train.npy')\n",
    "print X_train.shape\n",
    "print np.isfortran(X_train) # false"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2000, 512)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "X_test = load_from_data_dir('labelme/labelme_test.npy')\n",
    "print X_test.shape\n",
    "print np.isfortran(X_test)  # false"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing top k for query block 0 (queries 0-128)...\n",
      "computing top k for query block 5 (queries 640-768)...\n",
      "computing top k for query block 10 (queries 1280-1408)...\n",
      "computing top k for query block 15 (queries 1920-2000)...\n",
      "done\n"
     ]
    }
   ],
   "source": [
    "truth = utils.compute_true_knn(X_train, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.04996838  0.08774966  0.04675926  0.06643171  0.01966016]\n",
      " [ 0.0075171   0.0136047   0.02036818  0.01539837  0.00822605]\n",
      " [ 0.01428436  0.04946869  0.03160411  0.05185791  0.00135769]\n",
      " [ 0.00696543  0.0505463   0.09298885  0.04827282  0.00205458]\n",
      " [ 0.01164107  0.04647382  0.06512821  0.03711114  0.00478815]]\n",
      "[[ 0.03001729  0.03533581  0.02367221  0.00753544  0.00722851]\n",
      " [ 0.05340568  0.08936567  0.12642182  0.07891258  0.11755066]\n",
      " [ 0.05266207  0.06291605  0.04678894  0.05438183  0.00601782]\n",
      " [ 0.03347877  0.07613105  0.14790234  0.008575    0.04197339]\n",
      " [ 0.04045155  0.0408119   0.07632685  0.10935131  0.02766959]]\n",
      "[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627]\n",
      "[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627]\n",
      "[ 8889  8046 11823  4267 16831 14420 12037  2076  8762  9254]\n",
      "[ 8889  8046 11823  4267 16831 14420 12037  2076  8762  9254]\n",
      "[ 1143  1136  1138  9057 16789  1146  2066  1148  1649  2062]\n",
      "[ 1143  1136  1138  9057 16789  1146  2066  1148  1649  2062]\n",
      "[ 4439  6094  4343 18569 18608 18654  4435  1055  6451  4446]\n",
      "[ 4439  6094  4343 18569 18608 18654  4435  1055  6451  4446]\n",
      "[16583  6164 16613 14143 16943  7424 10617 10078 10656 13140]\n",
      "[16583  6164 16613 14143 16943  7424 10617 10078 10656 13140]\n"
     ]
    }
   ],
   "source": [
    "nrows = 5\n",
    "X = X_train\n",
    "Q = X_test\n",
    "print X[:5, :5]\n",
    "print Q[:5, :5]\n",
    "for i in range(nrows):\n",
    "    dists = utils.dists_sq(X, Q[i, :])\n",
    "    print np.argsort(dists, axis=-1)[:10]\n",
    "    print truth[i, :10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2000, 1000)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "print truth.shape\n",
    "print np.isfortran(truth)\n",
    "save_to_data_dir('labelme/labelme_truth.npy', truth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627 13380 10476\n",
      " 14664 13382 17015  1023 15942 10431 10440  8320]\n",
      "[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627 13380 10476\n",
      " 14664 13382 17015  1023 15942 10431 10440  8320]\n"
     ]
    }
   ],
   "source": [
    "print truth[0, :20]\n",
    "dists = utils.dists_sq(X_train, X_test[0])\n",
    "print np.argsort(dists)[:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MNIST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ------------------------ clean up mnist\n",
    "import mnist\n",
    "loader = mnist.MNIST(DATA_DIR + 'mnist/')\n",
    "X_train, Y_train = loader.load_training()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "............................\n",
      "............................\n",
      "............................\n",
      "............................\n",
      "............................\n",
      "...................@@.......\n",
      "..................@@@.......\n",
      "..................@@@.......\n",
      ".................@@@........\n",
      "................@@@.........\n",
      "................@@..........\n",
      "...............@@@..........\n",
      "..............@@@...........\n",
      "..............@@............\n",
      ".............@@.............\n",
      "............@@@.............\n",
      "............@@@.............\n",
      "...........@@@..............\n",
      "..........@@@...............\n",
      "..........@@@...............\n",
      "..........@@@...............\n",
      ".........@@@................\n",
      ".........@@@................\n",
      ".........@@@................\n",
      "..........@@................\n",
      "............................\n",
      "............................\n",
      "............................\n",
      "1\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-image pixel sequences into one matrix with 784 columns\n",
    "# (one row per image).\n",
    "X_train_2d = np.concatenate(X_train, axis=0)\n",
    "X_train_2d = X_train_2d.reshape((-1, 784))\n",
    "\n",
    "# Sanity check: render one image and print the label of that same sample,\n",
    "# so the picture and the label can be compared directly.\n",
    "# print(...) with a single argument behaves the same on Python 2 and 3.\n",
    "sample_idx = 3\n",
    "print(mnist.MNIST.display(X_train[sample_idx]))\n",
    "print(Y_train[sample_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<type 'array.array'>\n"
     ]
    }
   ],
   "source": [
    "# print(...) with a single argument behaves the same on Python 2 and 3; the\n",
    "# bare print statement is a SyntaxError on a Python 3 kernel.\n",
    "print(type(Y_train))\n",
    "\n",
    "# Persist the training split: pixels cast to float32, labels as loaded.\n",
    "# save_to_data_dir passes its input through np.ascontiguousarray before\n",
    "# np.save, so the array.array of labels is written as an ndarray.\n",
    "save_to_data_dir('mnist/X_train.npy', X_train_2d.astype(np.float32))\n",
    "save_to_data_dir('mnist/Y_train.npy', Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same preprocessing as the training split: load the test images/labels,\n",
    "# then flatten the images into a matrix with 784 columns.\n",
    "X_test, Y_test = loader.load_testing()\n",
    "X_test_2d = np.concatenate(X_test, axis=0).reshape((-1, 784))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the test split: pixels cast to float32, labels as loaded.\n",
    "save_to_data_dir(relative_path='mnist/X_test.npy',\n",
    "                 X=X_test_2d.astype(np.float32))\n",
    "save_to_data_dir(relative_path='mnist/Y_test.npy', X=Y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 784)\n",
      "0\n",
      "255\n"
     ]
    }
   ],
   "source": [
    "# Quick inspection of the flattened test matrix: shape and pixel range.\n",
    "# The min/max are taken from X_test_2d (same values as the nested X_test\n",
    "# lists) so the raw lists are not converted to an array twice more.\n",
    "# print(...) with a single argument behaves the same on Python 2 and 3.\n",
    "print(X_test_2d.shape)\n",
    "print(np.min(X_test_2d))\n",
    "print(np.max(X_test_2d))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing true euclidean distances...\n",
      "computing top k for query 0...\n",
      "computing top k for query 1000...\n",
      "computing top k for query 2000...\n",
      "computing top k for query 3000...\n",
      "computing top k for query 4000...\n",
      "computing top k for query 5000...\n",
      "computing top k for query 6000...\n",
      "computing top k for query 7000...\n",
      "computing top k for query 8000...\n",
      "computing top k for query 9000...\n",
      "done\n"
     ]
    }
   ],
   "source": [
    "# like other papers, treat test set as queries, train as database\n",
    "# (i.e. the database matrix is passed first, the query matrix second).\n",
    "# NOTE(review): the matrices are passed without the float32 cast that is\n",
    "# applied when they are saved -- presumably irrelevant for the neighbor\n",
    "# ranking; confirm against utils.compute_true_knn.\n",
    "truth = utils.compute_true_knn(X_train_2d, X_test_2d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ground truth computed in the previous cell. The filename\n",
    "# records the roles: Q (queries) = test set, X (database) = train set.\n",
    "# The reverse configuration, kept for reference, would be saved as:\n",
    "# save_to_data_dir('mnist/truth_Q=train_X=test.npy', truth)\n",
    "save_to_data_dir('mnist/truth_Q=test_X=train.npy', truth)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GloVe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1193514, 100)\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Load the GloVe vectors from the text file (one vector per row). The path\n",
    "# is built with join() so it does not depend on DATA_DIR ending in a\n",
    "# path separator.\n",
    "X = np.loadtxt(join(DATA_DIR, 'glove/glove.txt'))\n",
    "# print(...) with a single argument behaves the same on Python 2 and 3.\n",
    "print(X.shape)\n",
    "print(np.isfortran(X)) # false\n",
    "\n",
    "# Hold out the first N_QUERIES rows as queries; the remaining rows form the\n",
    "# database. NOTE: the database is named X_test here (and saved as\n",
    "# glove_test.npy), which rebinds the X_test name used in the MNIST section.\n",
    "N_QUERIES = 10000\n",
    "Q, X_test = X[:N_QUERIES], X[N_QUERIES:]\n",
    "save_to_data_dir('glove/glove_queries.npy', Q)\n",
    "save_to_data_dir('glove/glove_test.npy', X_test)\n",
    "\n",
    "# Q = load_from_data_dir('deep1m/deep1M_queries.npy')\n",
    "# print Q.shape\n",
    "# print np.isfortran(Q) # false"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing top k for query block 0 (queries 0-128)...\n",
      "computing top k for query block 5 (queries 640-768)...\n",
      "computing top k for query block 10 (queries 1280-1408)...\n",
      "computing top k for query block 15 (queries 1920-2048)...\n",
      "computing top k for query block 20 (queries 2560-2688)...\n",
      "computing top k for query block 25 (queries 3200-3328)...\n",
      "computing top k for query block 30 (queries 3840-3968)...\n",
      "computing top k for query block 35 (queries 4480-4608)...\n",
      "computing top k for query block 40 (queries 5120-5248)...\n",
      "computing top k for query block 45 (queries 5760-5888)...\n",
      "computing top k for query block 50 (queries 6400-6528)...\n",
      "computing top k for query block 55 (queries 7040-7168)...\n",
      "computing top k for query block 60 (queries 7680-7808)...\n",
      "computing top k for query block 65 (queries 8320-8448)...\n",
      "computing top k for query block 70 (queries 8960-9088)...\n",
      "computing top k for query block 75 (queries 9600-9728)...\n",
      "done\n",
      "int32\n",
      "(10000, 1000)\n"
     ]
    }
   ],
   "source": [
    "# Ground truth for GloVe via utils.compute_true_knn: the database (X_test)\n",
    "# is passed first and the queries (Q) second, as in the MNIST section.\n",
    "truth = utils.compute_true_knn(X_test, Q)\n",
    "# Report dtype and shape of the result (one row per query).\n",
    "# print(...) with a single argument behaves the same on Python 2 and 3.\n",
    "print(truth.dtype)\n",
    "print(truth.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the `truth` array computed in the previous cell (GloVe queries Q\n",
    "# against the GloVe database X_test) under the glove/ data directory.\n",
    "save_to_data_dir('glove/truth.npy', truth)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
